library(dlnm)
library(dplyr)

# 1. Main approach --------

# Model 1 -------

alri5_gam_1_mod <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         s(cf, k=4, fx=T) +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         # obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       ,
                       data=full_df_1_x,
                       family = quasipoisson())


plot(alri5_gam_1_mod)

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_x$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)



## model using crossbasis

alri5_gam_1_mod_cb <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         # obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       ,
                       data=full_df_1_x,
                       family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
  ) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_main <- ggplot(table_name_mean, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"),
             label.size = NA, fill = "white") +
  geom_segment(aes(x=0.45, xend=0.45, y=1.27, yend=1.32),
               color="black") +
  geom_segment(aes(x=0.55, xend=0.55, y=1.21, yend=1.26),
               color="black") +
  geom_segment(aes(x=0.46, xend=0.54, y=1.31, yend=1.26),
               color="black",
               arrow = arrow(length = unit(2, "mm"))) +
  annotate("text", x=0.43, y=1.35, label = "45%", size=4) +
  annotate("text", x=.575, y=1.27, label = "55%", size=4) +
  annotate("text", x=0.50, y=1.58, label = "MRR: 0.94\n(95% CI, 0.86-1.04)") +
  geom_segment(aes(x=0.75, xend=0.75, y=.92, yend=.97),
             color="black") +
  geom_segment(aes(x=.85, xend=.85, y=.76, yend=.81),
               color="black") +
  geom_segment(aes(x=0.76, xend=0.84, y=.96, yend=.83),
               color="black",
               arrow = arrow(length = unit(2, "mm"))) +
  annotate("text", x=0.74, y=.85, label = "75%", size=4) +
  annotate("text", x=0.86, y=.69, label = "85%", size=4) +
  annotate("text", x=0.80, y=1.13, label = "MRR: 0.82\n(95% CI, 0.79-0.85)") +
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 


crossbasis_fig_main



crossbasis_fig_main_histogram <- cowplot::plot_grid(
  crossbasis_fig_main,
  histogram,
  align="hv",
  rel_heights = c(1, 0.5),
  nrow=2
)


# cowplot::ggsave2("~/Desktop/Figure2_Gould_EHP_2022.pdf",
#                  crossbasis_fig_main_histogram,
#                  width = 225,
#                  height = 150,
#                  units = "mm",
#                  dpi = 600)


crossbasis_fig_empty <- ggplot(table_name_mean, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) +
  
 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  # geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
  #            label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 

histogram




# crossbasis_fig_empty_histogram

# Relative to the 45% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_45 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_45 <- ggplot(table_name_45, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 45% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 

# Relative to 75% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_75 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_75 <- ggplot(table_name_75, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 75% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 



# Relative to 20% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.20 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_20 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_20 <- ggplot(table_name_20, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 20% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 



# Relative to 100% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 1 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_100 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_100 <- ggplot(table_name_100, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 100% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 


# Model 0: Empty model -------

alri5_gam_empty_mod.cb <- gam(ALRI5_5_plus ~
                             offset(log(under5population)) + 
                             
                             ad.cb +
                             
                             as.factor(canton_id_universal) +
                             as.factor(period)
                           ,
                           data=full_df_1,
                           family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_empty_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model0 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 2 --------

alri5_gam_2_mod.cb <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         # obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         # Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use+
                         
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       
                       ,
                       data=full_df_1,
                       family = quasipoisson())
# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_2_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model2 <- as.data.frame(bind_cols(fit, lci, uci))

# Model 3 --------
alri5_gam_3_mod.cb <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         # elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       
                       ,
                       data=full_df_1,
                       family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_3_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model3 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 4 --------
alri5_gam_4_mod.cb <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         # elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         # Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       
                       ,
                       data=full_df_1,
                       family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model4 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 5 --------
alri5_gam_5_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            mage_mean + 
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1_x,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_5_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model5 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 1.2 Combine alternative specifications for figure ------

alternative_model_cb_estimates_df <- table_name_mean %>% 
  mutate(model = "Model 1:\nPreferred specification") %>% 
  bind_rows(table_name_mean.model0 %>% 
              mutate(model = "Model 0:\nEmpty model")) %>% 
  bind_rows(table_name_mean.model2 %>% 
              mutate(model = "Model 2:\nModel 1 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model3 %>% 
              mutate(model = "Model 3:\nModel 1 replace solid waste removal\nwith piped water")) %>% 
  bind_rows(table_name_mean.model4 %>% 
              mutate(model = "Model 4:\nModel 3 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model5 %>% 
              mutate(model = "Model 5:\nModel 1 plus mean\nage of mother")) %>% 
  dplyr::rename(cf = CF,
                fit = rr_lag0,
                fit.low = lci_lag0,
                fit.high = uci_lag0)

splines_specifications_plot <- ggplot() + 
  geom_ribbon(data=alternative_model_cb_estimates_df,
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=alternative_model_cb_estimates_df,
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  coord_cartesian(ylim=c(0.4,4)) +
  ggtitle("Alternative specifications", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(plot.margin = margin(t = 2.5, r=2.5, l=2.5, b=-2.5, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        plot.title = element_blank(), 
        legend.text = element_text(size=11, family="Helvetica"),
        axis.title.y = element_text(size=12, family = "Helvetica", face="bold"))  

histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(alpha=0.8, color="black", bins = 50, fill="dodgerblue4") + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(plot.margin = margin(t = -5, r=2.5, l=2.5, b=2.5, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        legend.title = element_blank(),
        legend.position = "none",
        plot.title = element_blank(),
        axis.title = element_text(size=12, family = "Helvetica", face="bold"))  

splines_specifications_histogram_plot <- cowplot::plot_grid(
  splines_specifications_plot,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.7)
)


splines_specifications_histogram_plot

specifications_figure_full <- cowplot::plot_grid(
  alternative_specifications_glm,
  splines_specifications_histogram_plot,
  rel_heights = c(1, 1.2),
  nrow=2
  
)

cowplot::ggsave2("~/BurkeLab Dropbox/Carlos Gould/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/specifications_figure_full_Aug2022.png",
                 specifications_figure_full,
                 width = 325,
                 height = 225,
                 units = "mm",
                 dpi = 275
)

# 1.3 Extra degree of freedom for splines -------

# 1.3.1 DF = 4 --------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 4)) # I use the df I found in the test model (above)



# Model 0: Empty model -------

alri5_gam_empty_mod.cb <- gam(ALRI5_5_plus ~
                                offset(log(under5population)) + 
                                
                                ad.cb +
                                
                                as.factor(canton_id_universal) +
                                as.factor(period)
                              ,
                              data=full_df_1,
                              family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_empty_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model0_df4 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 1 --------

alri5_gam_1_mod_cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_df4 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 2 --------

alri5_gam_2_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use+
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())
# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_2_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model2_df4 <- as.data.frame(bind_cols(fit, lci, uci))

# Model 3 --------
alri5_gam_3_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_3_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model3_df4 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 4 --------
alri5_gam_4_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model4_df4 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 5 --------
alri5_gam_5_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            mage_mean + 
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1_x,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_5_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model5_df4 <- as.data.frame(bind_cols(fit, lci, uci)) 


# Combine alternative specifications for figure ------

alternative_df4_model_cb_estimates_df <- table_name_mean_df4 %>% 
  mutate(model = "Model 1:\nPreferred specification") %>% 
  bind_rows(table_name_mean.model0_df4 %>% 
              mutate(model = "Model 0:\nEmpty model")) %>% 
  bind_rows(table_name_mean.model2_df4 %>% 
              mutate(model = "Model 2:\nModel 1 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model3_df4 %>% 
              mutate(model = "Model 3:\nModel 1 replace solid waste removal\nwith piped water")) %>% 
  bind_rows(table_name_mean.model4_df4 %>% 
              mutate(model = "Model 4:\nModel 3 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model5_df4 %>% 
              mutate(model = "Model 5:\nModel 1 plus mean\nage of mother")) %>% 
  dplyr::rename(cf = CF,
                fit = rr_lag0,
                fit.low = lci_lag0,
                fit.high = uci_lag0)

splines_df4_specifications_plot <- ggplot() + 
  geom_ribbon(data=alternative_df4_model_cb_estimates_df,
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=alternative_df4_model_cb_estimates_df,
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  coord_cartesian(ylim=c(0.4,4)) +
  ggtitle("Alternative specifications, 4 degrees of freedom", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(plot.margin = margin(t = 2.5, r=2.5, l=2.5, b=0, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        plot.title = element_text(size=14, family="Helvetica", face="bold"), 
        legend.text = element_text(size=11, family="Helvetica"),
        axis.title.y = element_text(size=12, family = "Helvetica", face="bold"))  

histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(alpha=0.8, color="black", bins = 50, fill="dodgerblue4") + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(plot.margin = margin(t = 0, r=2.5, l=2.5, b=2.5, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        legend.title = element_blank(),
        legend.position = "none",
        plot.title = element_blank(),
        axis.title = element_text(size=12, family = "Helvetica", face="bold"))  

# splines_df4_specifications_histogram_plot <- cowplot::plot_grid(
#   splines_df4_specifications_plot,
#   histogram,
#   align="hv",
#   nrow=2,
#   rel_heights = c(1, 0.6)
# )
# 
# 
# cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/specifications_df4_figure_full_Sept2021.png",
#                  splines_df4_specifications_histogram_plot,
#                  width = 325,
#                  height = 175,
#                  units = "mm",
#                  dpi = 275
)

# 1.3.2 DF = 5 --------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 5)) # I use the df I found in the test model (above)



# Model 0: Empty model -------

alri5_gam_empty_mod.cb <- gam(ALRI5_5_plus ~
                                offset(log(under5population)) + 
                                
                                ad.cb +
                                
                                as.factor(canton_id_universal) +
                                as.factor(period)
                              ,
                              data=full_df_1,
                              family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_empty_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model0_df5 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 1 --------

alri5_gam_1_mod_cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_df5 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 2 --------

alri5_gam_2_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use+
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())
# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_2_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model2_df5 <- as.data.frame(bind_cols(fit, lci, uci))

# Model 3 --------
alri5_gam_3_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_3_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model3_df5 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Model 4 --------
alri5_gam_4_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model4_df5 <- as.data.frame(bind_cols(fit, lci, uci)) 


# Model 5 --------
alri5_gam_5_mod.cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            # elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            # Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            mage_mean + 
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          
                          ,
                          data=full_df_1_x,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_5_mod.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean.model5_df5 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Combine alternative specifications for figure ------

alternative_df5_model_cb_estimates_df <- table_name_mean_df5 %>% 
  mutate(model = "Model 1:\nPreferred specification") %>% 
  bind_rows(table_name_mean.model0_df5 %>% 
              mutate(model = "Model 0:\nEmpty model")) %>% 
  bind_rows(table_name_mean.model2_df5 %>% 
              mutate(model = "Model 2:\nModel 1 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model3_df5 %>% 
              mutate(model = "Model 3:\nModel 1 replace solid waste removal\nwith piped water")) %>% 
  bind_rows(table_name_mean.model4_df5 %>% 
              mutate(model = "Model 4:\nModel 3 minus adult\nfemale literacy")) %>% 
  bind_rows(table_name_mean.model5_df5 %>% 
              mutate(model = "Model 5:\nModel 1 plus mean\nage of mother")) %>% 
  dplyr::rename(cf = CF,
                fit = rr_lag0,
                fit.low = lci_lag0,
                fit.high = uci_lag0)

splines_df5_specifications_plot <- ggplot() + 
  geom_ribbon(data=alternative_df5_model_cb_estimates_df,
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=alternative_df5_model_cb_estimates_df,
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  coord_cartesian(ylim=c(0.4,4)) +
  ggtitle("Alternative specifications, 5 degrees of freedom", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(plot.margin = margin(t = 2.5, r=2.5, l=2.5, b=0, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        plot.title = element_text(size=14, family="Helvetica", face="bold"), 
        plot.subtitle = element_blank(),
        legend.text = element_text(size=11, family="Helvetica"),
        axis.title.y = element_text(size=12, family = "Helvetica", face="bold"))  


splines_df45_specifications_histogram_plot <- cowplot::plot_grid(
  splines_df4_specifications_plot,
  splines_df5_specifications_plot,
  histogram,
  align="hv",
  nrow=3,
  rel_heights = c(1, 0.8, 0.6)
)


cowplot::ggsave2("~/BurkeLab Dropbox/Carlos Gould/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/specifications_df45_figure_full_Aug2022.png",
                 splines_df45_specifications_histogram_plot,
                 width = 325,
                 height = 250,
                 units = "mm",
                 dpi = 275
)

# 2. Effect modification ---------

# 2.1 Period specific -------

alri5_glm_1_1990 <- glm(ALRI5_5_plus ~
                          offset(log(under5population)) + 
                          
                          cf10 +
                          
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          materials_index + 
                          elimina_servidas_red_pozo_letrina + 
                          Literate_women + 
                          InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          pcv3_use + 
                          vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use 
                        ,
                        data=full_df_1 %>% 
                          filter(period=="1990"),
                        family = quasipoisson())

alri5_glm_1_2001 <- glm(ALRI5_5_plus ~
                          offset(log(under5population)) + 
                          
                          cf10 +
                          
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          materials_index + 
                          elimina_servidas_red_pozo_letrina + 
                          Literate_women + 
                          InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          pcv3_use + 
                          vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use 
                        ,
                        data=full_df_1 %>% 
                          filter(period=="2001"),
                        family = quasipoisson())


alri5_glm_1_2010 <- glm(ALRI5_5_plus ~
                          offset(log(under5population)) + 
                          
                          cf10 +
                          
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          materials_index + 
                          elimina_servidas_red_pozo_letrina + 
                          Literate_women + 
                          InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          pcv3_use + 
                          vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use 
                        ,
                        data=full_df_1 %>% 
                          filter(period=="2010"),
                        family = quasipoisson())


alri5_glm_1_2015 <- glm(ALRI5_5_plus ~
                          offset(log(under5population)) + 
                          
                          cf10 +
                          
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          materials_index + 
                          elimina_servidas_red_pozo_letrina + 
                          Literate_women + 
                          InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          pcv3_use + 
                          vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use 
                        ,
                        data=full_df_1 %>% 
                          filter(period=="2015-2019"),
                        family = quasipoisson())

# Assimilate GLM estimates -------

alri5_glm_period_estimates <- tidy(alri5_glm_1_mod)[2,] %>% 
  mutate(model = "Full model") %>% 
  bind_rows(tidy(alri5_glm_1_1990)[2,] %>% 
              mutate(model = "Period 1: 1988-1992")) %>% 
  bind_rows(tidy(alri5_glm_1_2001)[2,] %>% 
              mutate(model = "Period 2: 1999-2003")) %>% 
  bind_rows(tidy(alri5_glm_1_2010)[2,] %>% 
              mutate(model = "Period 3: 2008-2012")) %>% 
  bind_rows(tidy(alri5_glm_1_2015)[2,] %>% 
              mutate(model = "Period 4: 2015-2019")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) %>% 
  filter(model != "Full model")

# Make figure ---------

period_glm_fig <- 
  ggplot() +
  geom_interval(data=alri5_glm_period_estimates, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50) +
  geom_interval(data=alri5_glm_period_estimates, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66) +
  geom_interval(data=alri5_glm_period_estimates, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95) +  
  geom_point(data=alri5_glm_period_estimates, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Period subsets",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=12, face="bold"),
        legend.position = "top",
        legend.text = element_text(size=12, color="black"),
        legend.title =  element_blank())


# 2.1.2 GAMMS -------

# 1990 --------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)



alri5_gam_4_1990.cb <- mgcv::gam(ALRI5_5_plus ~  
                                offset(log(under5population)) + 
                                
                                ad.cb +
                                percent_rural_corrected_ratio + 
                                electricidad_no + 
                                
                                # material_techo_nicer_all + 
                                # material_techo_hormigon_losa_cemento_use + 
                                # material_pared_hormigon_bloque_ladrillo + 
                                # material_piso_nicer + 
                                # material_piso_tierra + 
                                materials_index + 
                                
                                # obtiene_agua_redpublica_tuberia_adentro_use + 
                                # servicio_higenico_redpublica_use + 
                                # elimina_servidas_red_pozo + 
                                # elimina_servidas_red_pozo_letrina + 
                                # servicio_ducha_excluso + 
                                # elimina_basura_service + 
                                hygiene_index + 
                                
                                # Literate_women + 
                                # Literate + 
                                # InSchool_girls + 
                                Idioma_indigena_self + 
                                
                                BCG_use + 
                                # DPT3_use + 
                                SAR_use + 
                                # Polio_use + 
                                pcv3_use + 
                                # vaccine_index + 
                                
                                ANC_use +  
                                ANC_visits_median_use 
                              # as.factor(period) + 
                              # as.factor(canton_id_universal)
                              ,
                              data=full_df_1 %>% 
                                filter(period=="1990"),
                              family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_1990.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_1990 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2001 -------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_4_2001.cb <- mgcv::gam(ALRI5_5_plus ~  
                                offset(log(under5population)) + 
                                
                                ad.cb +
                                percent_rural_corrected_ratio + 
                                electricidad_no + 
                                
                                # material_techo_nicer_all + 
                                # material_techo_hormigon_losa_cemento_use + 
                                # material_pared_hormigon_bloque_ladrillo + 
                                # material_piso_nicer + 
                                # material_piso_tierra + 
                                materials_index + 
                                
                                # obtiene_agua_redpublica_tuberia_adentro_use + 
                                # servicio_higenico_redpublica_use + 
                                # elimina_servidas_red_pozo + 
                                # elimina_servidas_red_pozo_letrina + 
                                # servicio_ducha_excluso + 
                                # elimina_basura_service + 
                                hygiene_index + 
                                
                                # Literate_women + 
                                # Literate + 
                                # InSchool_girls + 
                                Idioma_indigena_self + 
                                
                                BCG_use + 
                                # DPT3_use + 
                                SAR_use + 
                                # Polio_use + 
                                pcv3_use + 
                                # vaccine_index + 
                                
                                ANC_use +  
                                ANC_visits_median_use 
                              # as.factor(period) + 
                              # as.factor(canton_id_universal)
                              ,
                              data=full_df_1 %>% 
                                filter(period=="2001"),
                              family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2001.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2001 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2010 -------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_4_2010.cb <- mgcv::gam(ALRI5_5_plus ~  
                                offset(log(under5population)) + 
                                
                                ad.cb +
                                percent_rural_corrected_ratio + 
                                electricidad_no + 
                                
                                # material_techo_nicer_all + 
                                # material_techo_hormigon_losa_cemento_use + 
                                # material_pared_hormigon_bloque_ladrillo + 
                                # material_piso_nicer + 
                                # material_piso_tierra + 
                                materials_index + 
                                
                                # obtiene_agua_redpublica_tuberia_adentro_use + 
                                # servicio_higenico_redpublica_use + 
                                # elimina_servidas_red_pozo + 
                                # elimina_servidas_red_pozo_letrina + 
                                # servicio_ducha_excluso + 
                                # elimina_basura_service + 
                                hygiene_index + 
                                
                                # Literate_women + 
                                # Literate + 
                                # InSchool_girls + 
                                Idioma_indigena_self + 
                                
                                BCG_use + 
                                # DPT3_use + 
                                SAR_use + 
                                # Polio_use + 
                                pcv3_use + 
                                # vaccine_index + 
                                
                                ANC_use +  
                                ANC_visits_median_use 
                              # as.factor(period) + 
                              # as.factor(canton_id_universal)
                              ,
                              data=full_df_1 %>% 
                                filter(period=="2010"),
                              family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2010.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2010 <- as.data.frame(bind_cols(fit, lci, uci)) 


# 2015-2019 -------


ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_4_2015.cb <- gam(ALRI5_5_plus ~  
                          offset(log(under5population)) + 
                          
                          ad.cb +
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          # material_techo_nicer_all + 
                          # material_techo_hormigon_losa_cemento_use + 
                          # material_pared_hormigon_bloque_ladrillo + 
                          # material_piso_nicer + 
                          # material_piso_tierra + 
                          materials_index + 
                          
                          # obtiene_agua_redpublica_tuberia_adentro_use + 
                          # servicio_higenico_redpublica_use + 
                          # elimina_servidas_red_pozo + 
                          # elimina_servidas_red_pozo_letrina + 
                          # servicio_ducha_excluso + 
                          # elimina_basura_service + 
                          hygiene_index + 
                          
                          # Literate_women + 
                          # Literate + 
                          # InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          BCG_use + 
                          # DPT3_use + 
                          SAR_use + 
                          # Polio_use + 
                          pcv3_use + 
                          # vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use 
                        # as.factor(period) + 
                        # as.factor(canton_id_universal)
                        ,
                        data=full_df_1 %>% 
                          filter(period=="2015-2019"),
                        family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2015.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2015 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2.1.2 Combine and plot --------

period_cb_estimates_df <- table_name_mean %>% 
  mutate(model = "Full model",
         full = "Yes") %>% 
  bind_rows(table_name_1990 %>% 
              mutate(model = "1990",
                     full = "Yes")) %>% 
  bind_rows(table_name_2001 %>% 
              mutate(model = "2001",
                     full = "Yes")) %>% 
  bind_rows(table_name_2010 %>% 
              mutate(model = "2010",
                     full = "Yes")) %>% 
  bind_rows(table_name_2015 %>% 
              mutate(model = "2015-2019",
                     full = "Yes")) %>% 
  rename(cf = CF,
         fit = rr_lag0,
         fit.low = lci_lag0,
         fit.high = uci_lag0)

splines_period_plot <- ggplot() + 
  # geom_ribbon(data=period_cb_estimates_df %>% 
  #               filter(model=="Full model"),
  #             aes(x=cf, y=fit,
  #                 ymin=fit.low, ymax=fit.high), 
  #             alpha=0.1, fill="dodgerblue3", color=NA) +
  # geom_line(data=period_cb_estimates_df %>% 
  #             filter(model=="Full model"),
  #           aes(x=cf, y=fit), size=0.8, color="black") +
  geom_ribbon(data=period_cb_estimates_df %>% 
                filter(model!="Full model"),
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=period_cb_estimates_df %>% 
              filter(model!="Full model"),
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  coord_cartesian(ylim=c(0,6)) +
  ggtitle("Period subsets", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(plot.margin = margin(t = 2.5, r=2.5, l=2.5, b=-2.5, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        plot.title = element_blank(), 
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold"))  

histogram <- ggplot(full_df_1, aes(x=cf, group=period, fill=period)) +
  geom_histogram(alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(plot.margin = margin(t = -5, r=2.5, l=2.5, b=2.5, unit = "mm"),
        axis.text = element_text(size=11, family="Helvetica"),
        legend.title = element_blank(),
        legend.position = "none",
        plot.title = element_blank(),
        axis.title = element_text(size=12, family = "Helvetica", face="bold"))  

splines_period_histogram_plot <- cowplot::plot_grid(
  splines_period_plot,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.7)
)

# splines_period_histogram_plot

# 2.1.3 Full main period subset results plot -------


period_figure_full <- cowplot::plot_grid(
  period_glm_fig,
  splines_period_histogram_plot,
  rel_heights = c(1, 1.2),
  nrow=2
  
)

# cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/period_figure_full_Dec2021.png",
#                  period_figure_full,
#                  width = 275,
#                  height = 250,
#                  units = "mm",
#                  dpi = 200
# )


# 2.1.4 Comparisons for 45% to 55% and 75% to 85% ---------


# 45% to 55% -------

# 1990 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_1990.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_1990.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2001 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2001.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2001.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2010 -------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2010.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2010.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2015-2019 -------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2015.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2015.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 55%) -------
view(table_name_1990.45)
view(table_name_2001.45)
view(table_name_2010.45)
view(table_name_2015.45)

# 75% to 85% -------

# 1990 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_1990.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="1990") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_1990.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2001 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2001.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2001") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2001.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2010 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2010.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2010") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2010.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# 2015-2019 -------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_4_2015.cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1 %>% filter(period=="2015-2019") %>% dplyr::select(cf) %>% unlist, # exposure
  bylag = .1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_2015.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 85%) -------
view(table_name_1990.75)
view(table_name_2001.75)
view(table_name_2010.75)
view(table_name_2015.75)

# 2.2 Region specific -------


full_df_1 <- full_df_1 %>% 
  mutate(region = 
           ifelse(province=="19" |
                    province=="14" |
                    province=="16" |
                    province=="15" |
                    province == "21", "Amazon",
                  ifelse(province=="11" |
                           province=="01" |
                           province == "10" |
                           province=="06" |
                           province=="02" |
                           province == "03" | # canar
                           province=="05" |
                           province=="17" |
                           province=="04", "Andes", 
                         ifelse(province=="07" |
                                  province=="09" |
                                  province=="12" |
                                  province=="13" |
                                  province == "08" | # esmeraldas
                                  province == "18" | # tungurahua
                                  province=="20", "Coast", NA)))) %>% 
  mutate(region_f = factor(region, levels=c("Andes", "Amazon", "Coast"))) %>% 
  mutate(region_andes = ifelse(region=="Andes", 0, 1),
         region_amazon = ifelse(region=="Amazon", 0, 1),
         region_coast = ifelse(region=="Coast", 0, 1))

# Region Fixed Effect -------

alri5_glm_0_mod_region_fe <- 
  fixest::feglm(ALRI5_5_plus ~
                  offset(log(under5population)) + 
                  
                  cf10  |
                  
                  
                  as.factor(canton_id_universal) +
                  as.factor(region) +
                  as.factor(period)
                ,
                data=full_df_1,
                family = quasipoisson())


alri5_glm_1_mod_region_fe <- 
  fixest::feglm(ALRI5_5_plus ~
                  offset(log(under5population)) + 
                  
                  cf10 +
                  
                  percent_rural_corrected_ratio + 
                  electricidad_no + 
                  
                  # material_techo_nicer_all + 
                  # material_techo_hormigon_losa_cemento_use + 
                  # material_pared_hormigon_bloque_ladrillo + 
                  # material_piso_nicer + 
                  # material_piso_tierra + 
                  materials_index + 
                  
                  # obtiene_agua_redpublica_tuberia_adentro_use + 
                  # servicio_higenico_redpublica_use + 
                  # elimina_servidas_red_pozo + 
                  elimina_servidas_red_pozo_letrina + 
                  # servicio_ducha_excluso + 
                  # elimina_basura_service + 
                  # hygiene_index + 
                  
                  Literate_women + 
                  InSchool_girls + 
                  Idioma_indigena_self + 
                  
                  # BCG_use + 
                  # DPT3_use + 
                  # SAR_use + 
                  # Polio_use + 
                  pcv3_use + 
                  vaccine_index + 
                  
                  ANC_use +  
                  ANC_visits_median_use |
                  
                  
                  as.factor(canton_id_universal) +
                  as.factor(region) +
                  as.factor(period)
                ,
                data=full_df_1,
                family = quasipoisson())

# cross basis ------
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1 %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df=3)) # I use the df I found in the test model (above)

alri5_gam_0_mod_cb_region_fe <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        ad.cb +
        
        as.factor(region) +
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_0_mod_cb_region_fe, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_1 %>% dplyr::select(cf) %>% unlist(),
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_0_region_fe <- as.data.frame(bind_cols(fit, lci, uci)) 

alri5_gam_1_mod_cb_region_fe <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        ad.cb +
        
        percent_rural_corrected_ratio + 
        electricidad_no + 
        
        # material_techo_nicer_all + 
        # material_techo_hormigon_losa_cemento_use + 
        # material_pared_hormigon_bloque_ladrillo + 
        # material_piso_nicer + 
        # material_piso_tierra + 
        materials_index + 
        
        # obtiene_agua_redpublica_tuberia_adentro_use + 
        # servicio_higenico_redpublica_use + 
        # elimina_servidas_red_pozo + 
        elimina_servidas_red_pozo_letrina + 
        # servicio_ducha_excluso + 
        # elimina_basura_service + 
        # hygiene_index + 
        
        Literate_women + 
        InSchool_girls + 
        Idioma_indigena_self + 
        
        # BCG_use + 
        # DPT3_use + 
        # SAR_use + 
        # Polio_use + 
        pcv3_use + 
        vaccine_index + 
        
        ANC_use +  
        ANC_visits_median_use +
        
        as.factor(region) +
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb_region_fe, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_1 %>% dplyr::select(cf) %>% unlist(),
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_1_region_fe <- as.data.frame(bind_cols(fit, lci, uci)) 

region_fe_spline <- 
  ggplot(table_name_0_region_fe %>% 
         mutate(model="Empty model\nwith region FE") %>% 
         bind_rows(table_name_1_region_fe %>% 
                     mutate(model="Preferred specification\nwith region FE")) %>% 
         rename(cf = CF,
                fit = rr_lag0,
                fit.low = lci_lag0,
                fit.high = uci_lag0), 
       aes(x=cf, y=fit, ymin=fit.low, ymax=fit.high, color=model, fill=rev(model))) + 
         geom_ribbon(alpha=0.1, color=NA) +
         geom_line(size=0.8) +
         ggsci::scale_color_lancet() + 
         theme_bw() + 
         geom_hline(yintercept = 1, linetype = 2, size=0.3) +
         coord_cartesian(ylim=c(0.4,4.8)) +
         ggtitle("", subtitle="Quasi-Poisson generalized additive mixed models") + 
         scale_x_continuous(labels=scales::percent_format()) + 
         xlab("Clean fuel use") + 
         ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
         theme(axis.text = element_text(size=10, family="Helvetica"),
               axis.title.x=element_blank(),
               legend.position = "none",
               legend.title = element_blank(),
               plot.title = element_blank(), 
               legend.text = element_text(size=10, family="Helvetica"),
               axis.title.y = element_text(size=8, family = "Helvetica", face="bold"))  
       
       
alri5_glm_estimates_region_fe <- 
  tidy(alri5_glm_0_mod_region_fe)[1,] %>% 
  mutate(model = "Empty model\nwith region FE") %>% 
  bind_rows(tidy(alri5_glm_1_mod_region_fe)[1,] %>% 
              mutate(model = "Preferred specification\nwith region FE")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) 

# 5.1 Make figure ---------

alternative_specifications_glm_region_fe <- ggplot() +
  geom_interval(data=alri5_glm_estimates_region_fe, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50, size = 2.5) +
  geom_interval(data=alri5_glm_estimates_region_fe, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66, size = 3) +
  geom_interval(data=alri5_glm_estimates_region_fe, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95, size = 3.5) +  
  geom_point(data=alri5_glm_estimates_region_fe, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Inclusion of regional fixed effect",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)\nper 10 p.p. increase in %CF") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.6, 0.7, 0.8, 0.9, 1.0)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  # scale_color_viridis_d() + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=8, color="black"),
        axis.text.x = element_text(size=8, color="black"),
        axis.title = element_text(size=8, face="bold"),
        plot.title = element_text(size=10, face="bold"),
        legend.position="none",
        legend.title = element_blank(),
        legend.text=element_text(size=10, color="black"))


region_fe_plot <- 
  plot_grid(
    alternative_specifications_glm_region_fe,
    region_fe_spline,
    histogram,
    nrow=3,
    align="hv",
    rel_heights=c(1, 1, 0.8)
  )

region_fe_plot


alri5_gam_1_mod_regioninteract <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        s(cf, by=region_f) +
        
        percent_rural_corrected_ratio + 
        electricidad_no + 
        
        # material_techo_nicer_all + 
        # material_techo_hormigon_losa_cemento_use + 
        # material_pared_hormigon_bloque_ladrillo + 
        # material_piso_nicer + 
        # material_piso_tierra + 
        materials_index + 
        
        # obtiene_agua_redpublica_tuberia_adentro_use + 
        # servicio_higenico_redpublica_use + 
        # elimina_servidas_red_pozo + 
        elimina_servidas_red_pozo_letrina + 
        # servicio_ducha_excluso + 
        # elimina_basura_service + 
        # hygiene_index + 
        
        Literate_women + 
        InSchool_girls + 
        Idioma_indigena_self + 
        
        # BCG_use + 
        # DPT3_use + 
        # SAR_use + 
        # Polio_use + 
        pcv3_use + 
        vaccine_index + 
        
        ANC_use +  
        ANC_visits_median_use +
        
        # as.factor(region) + 
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())

plot(alri5_gam_1_mod_regioninteract)

alri5_gam_1_mod_regionfe_cb <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        ad.cb*region_f +
        
        percent_rural_corrected_ratio + 
        electricidad_no + 
        
        # material_techo_nicer_all + 
        # material_techo_hormigon_losa_cemento_use + 
        # material_pared_hormigon_bloque_ladrillo + 
        # material_piso_nicer + 
        # material_piso_tierra + 
        materials_index + 
        
        # obtiene_agua_redpublica_tuberia_adentro_use + 
        # servicio_higenico_redpublica_use + 
        # elimina_servidas_red_pozo + 
        elimina_servidas_red_pozo_letrina + 
        # servicio_ducha_excluso + 
        # elimina_basura_service + 
        # hygiene_index + 
        
        Literate_women + 
        InSchool_girls + 
        Idioma_indigena_self + 
        
        # BCG_use + 
        # DPT3_use + 
        # SAR_use + 
        # Polio_use + 
        pcv3_use + 
        vaccine_index + 
        
        ANC_use +  
        ANC_visits_median_use +
        
        # as.factor(region) + 
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_regionfe_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_1 %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_coast <- as.data.frame(bind_cols(fit, lci, uci)) 



alri5_gam_1_mod_regionfe <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        s(cf, by=region_f) +
        
        percent_rural_corrected_ratio + 
        electricidad_no + 
        
        # material_techo_nicer_all + 
        # material_techo_hormigon_losa_cemento_use + 
        # material_pared_hormigon_bloque_ladrillo + 
        # material_piso_nicer + 
        # material_piso_tierra + 
        materials_index + 
        
        # obtiene_agua_redpublica_tuberia_adentro_use + 
        # servicio_higenico_redpublica_use + 
        # elimina_servidas_red_pozo + 
        elimina_servidas_red_pozo_letrina + 
        # servicio_ducha_excluso + 
        # elimina_basura_service + 
        # hygiene_index + 
        
        Literate_women + 
        InSchool_girls + 
        Idioma_indigena_self + 
        
        # BCG_use + 
        # DPT3_use + 
        # SAR_use + 
        # Polio_use + 
        pcv3_use + 
        vaccine_index + 
        
        ANC_use +  
        ANC_visits_median_use +
        
        # as.factor(region) + 
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())




alri5_gam_1_mod_regionfe <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        s(cf, by=region_f) +
        
        percent_rural_corrected_ratio + 
        electricidad_no + 
        
        # material_techo_nicer_all + 
        # material_techo_hormigon_losa_cemento_use + 
        # material_pared_hormigon_bloque_ladrillo + 
        # material_piso_nicer + 
        # material_piso_tierra + 
        materials_index + 
        
        # obtiene_agua_redpublica_tuberia_adentro_use + 
        # servicio_higenico_redpublica_use + 
        # elimina_servidas_red_pozo + 
        elimina_servidas_red_pozo_letrina + 
        # servicio_ducha_excluso + 
        # elimina_basura_service + 
        # hygiene_index + 
        
        Literate_women + 
        InSchool_girls + 
        Idioma_indigena_self + 
        
        # BCG_use + 
        # DPT3_use + 
        # SAR_use + 
        # Polio_use + 
        pcv3_use + 
        vaccine_index + 
        
        ANC_use +  
        ANC_visits_median_use +
        
        # as.factor(region) + 
        as.factor(canton_id_universal) +
        as.factor(period)
      ,
      data=full_df_1,
      family = quasipoisson())

plot(alri5_gam_1_mod_regionfe)






tidy(main_model_region_interact)

main_model_region_interact <- 
  fixest::feglm(
    ALRI5_5_plus ~
      offset(log(under5population)) + 
      
      cf10*region_f +
      
      percent_rural_corrected_ratio + 
      electricidad_no + 
      
      
      materials_index + 
      
      # obtiene_agua_redpublica_tuberia_adentro_use + 
      # servicio_higenico_redpublica_use + 
      # elimina_servidas_red_pozo + 
      elimina_servidas_red_pozo_letrina + 
      # servicio_ducha_excluso + 
      # elimina_basura_service + 
      # hygiene_index + 
      
      Literate_women + 
      InSchool_girls + 
      Idioma_indigena_self + 
      
      # BCG_use + 
      # DPT3_use + 
      # SAR_use + 
      # Polio_use + 
      pcv3_use + 
      vaccine_index + 
      
      ANC_use +  
      ANC_visits_median_use 
    |
      # as.factor(region) + 
      canton_id_universal +
      period,
    data=full_df_1,
    family = quasipoisson())

summary(marginaleffects::marginaleffects(main_model_region_interact,type = "link"))

region_fe_linear <- 
  summary(marginaleffects::marginaleffects(main_model_region_interact,type = "link"), by="region_f")[1:3,]

region_fe_linear_df <- 
  region_fe_linear %>% 
  as_tibble() %>% 
  mutate(term = c("Coast", "Andes", "Amazon")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))
  ) 

region_fe_linear_fig <- 
  ggplot() +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int95_low, ymax=int95_hi, color=term), alpha=0.50, size = 2.5) +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int80_low, ymax=int80_hi, color=term), alpha=0.66, size = 3) +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int66_low, ymax=int66_hi, color=term), alpha=0.95, size = 3.5) +  
  geom_point(data=region_fe_linear_df, 
             aes(x=term, y=irr), size=2, shape=18, color="white") + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Regional interaction plot",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio\n(66%, 80%, 95% CI)\nper 10 p.p. increase in %CF") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  # scale_color_viridis_d() + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=12, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=14, face="bold"),
        legend.position="top",
        legend.title = element_blank(),
        legend.text=element_text(size=10, color="black"))


n linear plots -------
p0_region1 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=2)

estimate0_region1 <- unlist(p0_region1[[2]]$fit) %>% 
  bind_cols(p0_region1[[2]]$se) %>% 
  bind_cols(p0_region1[[2]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Coast")

p0_region2 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=3)

estimate0_region2 <- unlist(p0_region2[[3]]$fit) %>% 
  bind_cols(p0_region2[[3]]$se) %>% 
  bind_cols(p0_region2[[3]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Andes")

p0_region3 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=1)

estimate0_region3 <- unlist(p0_region3[[1]]$fit) %>% 
  bind_cols(p0_region3[[1]]$se) %>% 
  bind_cols(p0_region3[[1]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Amazon")


estimate0_regions <- 
  estimate0_region3 %>% 
  bind_rows(estimate0_region1, estimate0_region2) %>% 
  mutate(region_f = factor(region, levels=c("Andes", "Amazon", "Coast")))

region_fe_nonlinear_fig <- 
  ggplot(estimate0_regions, 
         aes(x=cf, y=fit, ymin=fit.low, ymax=fit.high, color=region_f, fill=region_f, group=region_f)) +
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 0, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Response of under-5 LRI\nmortality relative to the Mean\n(71% CF)") + 
  theme(legend.position = "none",
        axis.title.y = element_text(size=10))  


region_fe_nonlinear_fig

region_histogram <- ggplot(full_df_1, aes(x=cf, group=region, fill=region)) +
  geom_histogram(alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(legend.position = "none",
        axis.title.y = element_text(size=10))  

region_fe_figure <- cowplot::plot_grid(
  region_fe_linear_fig,
  region_fe_nonlinear_fig,
  region_histogram,
  align="hv",
  rel_heights = c(1, 1, 0.5),
  nrow=3
)

region_fe_figure



# 2.2.0 Create subsets ------

full_df_main_nog <- full_df_1 %>%
  filter(province!="20")

full_df_amazon <- full_df_1 %>% 
  filter(province=="19" |
           province=="14" |
           province=="16" |
           province=="15" |
           province == "21")

full_df_andes <- full_df_1 %>% 
  filter(province=="11" |
           province=="01" |
           province == "10" |
           province=="06" |
           province=="02" |
           province == "03" | # canar
           province=="05" |
           province=="17" |
           province=="04")

full_df_coast <- full_df_1 %>% 
  filter(province=="07" |
           province=="09" |
           province=="12" |
           province=="13" |
           province == "08" | # esmeraldas
           province == "18" | # tungurahua
           province=="20")

full_df_coast_nog <- full_df_1 %>% 
  filter(province=="07" |
           province=="09" |
           province=="12" |
           province=="13" |
           province == "08" | # esmeraldas
           province == "18")

full_df_1 <- full_df_1 %>% 
  mutate(region = ifelse(province=="19" |
                           province=="14" |
                           province=="16" |
                           province=="15" |
                           province == "21", "Amazon",
                         ifelse(province=="11" |
                                  province=="01" |
                                  province == "10" |
                                  province=="06" |
                                  province=="02" |
                                  province == "03" | # canar
                                  province=="05" |
                                  province=="17" |
                                  province=="04", "Andes",
                                ifelse(province=="07" |
                                         province=="09" |
                                         province=="12" |
                                         province=="13" |
                                         province == "08" | # esmeraldas
                                         province == "18" | # tungurahua
                                         province=="20",
                                       "Coast", NA))))

# 2.2.1 Regional GLMs --------


alri5_glm_1_nog <- fixest::feglm(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         cf10 +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         materials_index + 
                         elimina_servidas_red_pozo_letrina + 
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use |
                         canton_id_universal + 
                           period
                       ,
                       data=full_df_main_nog,
                       family = quasipoisson())

alri5_glm_1_amazon <- fixest::feglm(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            cf10 +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use |
                              canton_id_universal + 
                              period
                          ,
                          data=full_df_amazon,
                          family = quasipoisson())

alri5_glm_1_andes <- fixest::feglm(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           cf10 +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use |
                             canton_id_universal + 
                             period
                         ,
                         data=full_df_andes,
                         family = quasipoisson())

alri5_glm_1_coast <- fixest::feglm(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           cf10 +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use |
                           canton_id_universal + 
                             period
                         ,
                         data=full_df_coast,
                         family = quasipoisson())

alri5_glm_1_coastnog <- fixest::feglm(ALRI5_5_plus ~
                              offset(log(under5population)) + 
                              
                              cf10 +
                              
                              percent_rural_corrected_ratio + 
                              electricidad_no + 
                              
                              materials_index + 
                              elimina_servidas_red_pozo_letrina + 
                              Literate_women + 
                              InSchool_girls + 
                              Idioma_indigena_self + 
                              
                              pcv3_use + 
                              vaccine_index + 
                              
                              ANC_use +  
                              ANC_visits_median_use |
                                canton_id_universal + 
                                period
                            ,
                            data=full_df_coast_nog,
                            family = quasipoisson())



# Assimilate GLM estimates -------

alri5_glm_region_estimates <- tidy(alri5_glm_1_mod)[1,] %>% 
  mutate(model = "Full model") %>% 
  bind_rows(tidy(alri5_glm_1_nog)[1,] %>% 
              mutate(model = "Full model,\ndrop Galapagos")) %>% 
  bind_rows(tidy(alri5_glm_1_amazon)[1,] %>% 
              mutate(model = "Amazonian region")) %>% 
  bind_rows(tidy(alri5_glm_1_andes)[1,] %>% 
              mutate(model = "Andean region")) %>% 
  bind_rows(tidy(alri5_glm_1_coast)[1,] %>% 
              mutate(model = "Coastal region")) %>% 
  bind_rows(tidy(alri5_glm_1_coastnog)[1,] %>% 
              mutate(model = "Coastal region,\ndrop Galapagos")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) 

# Make figure ---------

region_glm_fig <- ggplot() + 
  geom_interval(data=alri5_glm_region_estimates %>% 
                  filter(#model=="Full model" |
                           model=="Amazonian region" |
                           model=="Andean region" |
                           model=="Coastal region") %>% 
                  mutate(model = factor(model, levels=c(# "Full model", 
                    "Amazonian region", "Andean region", "Coastal region"))), 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50) + 
  geom_interval(data=alri5_glm_region_estimates  %>% 
                  filter(# model=="Full model" |
                           model=="Amazonian region" |
                           model=="Andean region" |
                           model=="Coastal region")  %>% 
                  mutate(model = factor(model, levels=c(# "Full model", 
                    "Amazonian region", "Andean region", "Coastal region"))), 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66) + 
  geom_interval(data=alri5_glm_region_estimates  %>% 
                  filter(# model=="Full model" |
                           model=="Amazonian region" |
                           model=="Andean region" |
                           model=="Coastal region")  %>% 
                  mutate(model = factor(model, levels=c(# "Full model", 
                    "Amazonian region", "Andean region", "Coastal region"))), 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95) + 
  geom_point(data=alri5_glm_region_estimates  %>% 
               filter(# model=="Full model" |
                        model=="Amazonian region" |
                        model=="Andean region" |
                        model=="Coastal region")  %>% 
               mutate(model = factor(model, levels=c(# "Full model", 
                 "Amazonian region", "Andean region", "Coastal region"))), 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  scale_x_discrete(limits = c(#"Full model", 
    "Amazonian region", "Andean region", "Coastal region")) + 
  ggsci::scale_color_lancet() + 
  # coord_cartesian()
  ggtitle("Regional subsets", subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + 
  xlab("") + 
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)) + 
  # coord_cartesian(ylim=c(0.4, 1.6)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"), 
        axis.text.x = element_text(size=10, color="black"), 
        axis.title = element_text(size=10, face="bold"), 
        plot.title = element_text(size=12, face="bold"),
        legend.position = "top", 
        legend.text = element_text(size=10, color="black"),
        legend.title = element_blank())

region_glm_fig

# Full


# 2.2.2 Regional GAMMs --------



# Amazon ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_amazon <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            as.factor(period) + 
                            as.factor(canton_id_universal)
                          ,
                          data=full_df_amazon,
                          family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_amazon, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_amazon <- as.data.frame(bind_cols(fit, lci, uci)) 

# Andes ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_andes <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_andes,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_andes, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_andes <- as.data.frame(bind_cols(fit, lci, uci)) 

# Coastal ------


ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_coast <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_coast,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_coast, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_coast <- as.data.frame(bind_cols(fit, lci, uci)) 


# 2.2.3 Combine and plot --------

region_cb_estimates_df <- table_name_mean %>% 
  mutate(model = "Full model",
         full = "Yes") %>% 
  bind_rows(table_name_andes %>% 
              mutate(model = "Andean region",
                     full = "Yes")) %>% 
  bind_rows(table_name_coast %>% 
              mutate(model = "Coastal region",
                     full = "Yes")) %>% 
  bind_rows(table_name_amazon %>% 
              mutate(model = "Amazonian region",
                     full = "Yes")) %>% 
  rename(cf = CF,
         fit = rr_lag0,
         fit.low = lci_lag0,
         fit.high = uci_lag0)

splines_region_plot <- ggplot() + 
  # geom_ribbon(data=period_cb_estimates_df %>% 
  #               filter(model=="Full model"),
  #             aes(x=cf, y=fit,
  #                 ymin=fit.low, ymax=fit.high), 
  #             alpha=0.1, fill="dodgerblue3", color=NA) +
  # geom_line(data=period_cb_estimates_df %>% 
  #             filter(model=="Full model"),
  #           aes(x=cf, y=fit), size=0.8, color="black") +
  geom_ribbon(data=region_cb_estimates_df %>% 
                filter(model!="Full model"),
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=region_cb_estimates_df %>% 
              filter(model!="Full model"),
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  coord_cartesian(ylim=c(0,5)) +
  ggtitle("Regional subsets", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        plot.title = element_blank(), 
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold"))  

histogram <- ggplot(full_df_1, aes(x=cf, group=region, fill=region)) +
  geom_histogram(alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=10, family="Helvetica"),
        legend.title = element_blank(),
        legend.position = "none",
        axis.title = element_text(size=10, family = "Helvetica", face="bold"))  

splines_region_histogram_plot <- cowplot::plot_grid(
  splines_region_plot,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.6)
)

splines_region_histogram_plot

# 2.2.3.1 Full region plot -------

region_figure_full <- 
  cowplot::plot_grid(
    region_glm_fig,
    splines_region_histogram_plot,
    rel_heights=c(0.8, 1),
    nrow=2
    
  )

cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/region_figure_full_Dec2021.png",
                 region_figure_full,
                 width = 275,
                 height = 250,
                 units = "mm",
                 dpi = 200
)




# 2.2.4 Extra comparisons for 45% to 55% and 75% to 85% ---------

# 45% to 55% -------

# Amazon ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_amazon <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            as.factor(period) + 
                            as.factor(canton_id_universal)
                          ,
                          data=full_df_amazon,
                          family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_amazon, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_amazon.45 <- as.data.frame(bind_cols(fit, lci, uci)) 



# Andes ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_andes <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_andes,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_andes, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_andes.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Coastal ------


ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_coast <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_coast,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_coast, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_coast.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 55%) -------
view(table_name_andes.45)
view(table_name_amazon.45)
view(table_name_coast.45)


# 75% to 85% -------

# Amazon ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_amazon <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            as.factor(period) + 
                            as.factor(canton_id_universal)
                          ,
                          data=full_df_amazon,
                          family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_amazon, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_amazon %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_amazon.75 <- as.data.frame(bind_cols(fit, lci, uci)) 



# Andes ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)

alri5_gam_1_andes <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_andes,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_andes, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_andes %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_andes.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Coastal ------


ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_coast <- gam(ALRI5_5_plus ~
                           offset(log(under5population)) + 
                           
                           ad.cb +
                           
                           percent_rural_corrected_ratio + 
                           electricidad_no + 
                           
                           materials_index + 
                           elimina_servidas_red_pozo_letrina + 
                           Literate_women + 
                           InSchool_girls + 
                           Idioma_indigena_self + 
                           
                           pcv3_use + 
                           vaccine_index + 
                           
                           ANC_use +  
                           ANC_visits_median_use +
                           as.factor(period) + 
                           as.factor(canton_id_universal)
                         ,
                         data=full_df_coast,
                         family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_coast, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_coast %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_coast.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 55%) -------
view(table_name_andes.75)
view(table_name_amazon.75)
view(table_name_coast.75)

# 2.3 Sex specific -------

mortality_sexo <- read_rds("~/BurkeLab Dropbox/Carlos Gould/Ecuador National Health and LPG Analysis/Data/Datasets/Created_by_me/canton_mortality_rollmean_sexo_June1.rds")
# full_df_1_sex_1 <- read_csv("~/Desktop/full_df_1_sex_1.csv")

full_df_1_female <- full_df_1_sex_1 %>% 
  filter(sexo=="2")

full_df_1_female_2 <- full_df_1 %>% 
  left_join(full_df_1_female %>% 
              dplyr::select(canton_id_universal, period, 
                            sexo, sex_ratio, sex_ratio_use, sex_ratio_use1, sex_ratio_use2, 
                            under5population_girls_use, 
                            ALRI5_5_plus) %>% 
              dplyr::rename(ALRI5_5_plus_female = ALRI5_5_plus), by = c("canton_id_universal", "period")) %>% 
  mutate(ALRI5_5_plus_female_1 = ifelse(is.na(ALRI5_5_plus_female), 0, ALRI5_5_plus_female)) %>% 
  group_by(canton_id_universal) %>% 
  mutate(sex_ratio_mean = mean(sex_ratio, na.rm=T)) %>% 
  mutate(sex_ratio_use3 = lag(sex_ratio, 3),
         sex_ratio_use4 = lag(sex_ratio, 4)) %>% 
  mutate(sex_ratio_use_1 = ifelse(is.na(sex_ratio), sex_ratio_use1, sex_ratio)) %>% 
  mutate(sex_ratio_use_2 = ifelse(is.na(sex_ratio_use_1), sex_ratio_use2, sex_ratio_use_1)) %>% 
  mutate(sex_ratio_use_3 = ifelse(is.na(sex_ratio_use_2), sex_ratio_use3, sex_ratio_use_2)) %>% 
  mutate(sex_ratio_use_4 = ifelse(is.na(sex_ratio_use_3), sex_ratio_use4, sex_ratio_use_3)) %>% 
  mutate(sex_ratio_use_5 = ifelse(is.na(sex_ratio_use_4), sex_ratio_mean, sex_ratio_use_4)) %>% 
  mutate(under5population_female = ifelse(is.na(under5population_girls_use), under5population*sex_ratio_use_5, under5population_girls_use))

full_df_1_male <- full_df_1_sex_1 %>% 
  filter(sexo=="1")

full_df_1_male_2 <- full_df_1 %>% 
  left_join(full_df_1_male %>% 
              dplyr::select(canton_id_universal, period, 
                            under5population_sex_other,
                            under5population_boys_use, 
                            ALRI5_5_plus) %>% 
              dplyr::rename(ALRI5_5_plus_male = ALRI5_5_plus), by = c("canton_id_universal", "period")) %>% 
  left_join(full_df_1_female_2 %>% dplyr::select(canton_id_universal, period, under5population_female)) %>% 
  mutate(ALRI5_5_plus_male_1 = ifelse(is.na(ALRI5_5_plus_male), 0, ALRI5_5_plus_male),
         sex_ratio = under5population_boys_use / (under5population_boys_use+under5population_female)) %>% 
  group_by(canton_id_universal) %>% 
  mutate(sex_ratio_mean = mean(sex_ratio, na.rm=T)) %>% 
  mutate(sex_ratio_use1 = lag(sex_ratio, 1),
         sex_ratio_use2 = lag(sex_ratio, 2),
         sex_ratio_use3 = lag(sex_ratio, 3),
         sex_ratio_use4 = lag(sex_ratio, 4)) %>% 
  mutate(sex_ratio_use_1 = ifelse(is.na(sex_ratio), sex_ratio_use1, sex_ratio)) %>% 
  mutate(sex_ratio_use_2 = ifelse(is.na(sex_ratio_use_1), sex_ratio_use2, sex_ratio_use_1)) %>% 
  mutate(sex_ratio_use_3 = ifelse(is.na(sex_ratio_use_2), sex_ratio_use3, sex_ratio_use_2)) %>% 
  mutate(sex_ratio_use_4 = ifelse(is.na(sex_ratio_use_3), sex_ratio_use4, sex_ratio_use_3)) %>% 
  mutate(sex_ratio_use_5 = ifelse(is.na(sex_ratio_use_4), sex_ratio_mean, sex_ratio_use_4)) %>% 
  mutate(under5population_male = ifelse(is.na(under5population_boys_use), under5population*sex_ratio_use_5, under5population_boys_use))


# 2.3.1 Sex specific GLMs --------

alri5_glm_1_female <- fixest::feglm(ALRI5_5_plus_female_1 ~
                            offset(log(under5population_female)) + 
                            
                            cf10 +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use |
                            as.factor(canton_id_universal) +
                            as.factor(period) 
                          ,
                          data=full_df_1_female_2,
                          family = quasipoisson())

alri5_glm_1_male <- fixest::feglm(ALRI5_5_plus_male_1 ~
                          offset(log(under5population_male)) + 
                          
                          cf10 +
                          
                          percent_rural_corrected_ratio + 
                          electricidad_no + 
                          
                          materials_index + 
                          elimina_servidas_red_pozo_letrina + 
                          Literate_women + 
                          InSchool_girls + 
                          Idioma_indigena_self + 
                          
                          pcv3_use + 
                          vaccine_index + 
                          
                          ANC_use +  
                          ANC_visits_median_use |
                          as.factor(canton_id_universal) +
                          as.factor(period)
                        ,
                        data=full_df_1_male_2,
                        family = quasipoisson())



# Assimilate GLM estimates -------

alri5_glm_sex_estimates <- tidy(alri5_glm_1_mod)[1,] %>% 
  mutate(model = "Full model") %>% 
  bind_rows(tidy(alri5_glm_1_female)[1,] %>% 
              mutate(model = "Sex: Female")) %>% 
  bind_rows(tidy(alri5_glm_1_male)[1,] %>% 
              mutate(model = "Sex: Male")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) %>% 
  filter(model != "Full model")

# Make figure ---------

sex_glm_fig <- ggplot() +
  geom_interval(data=alri5_glm_sex_estimates, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50) +
  geom_interval(data=alri5_glm_sex_estimates, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66) +
  geom_interval(data=alri5_glm_sex_estimates, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95) +  
  geom_point(data=alri5_glm_sex_estimates, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  
  scale_x_discrete(limits = c(# "Full model",
                              "Sex: Female",
                              "Sex: Male")) + 
  
  ggtitle("Sex subsets",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        legend.title = element_blank(),
        legend.position = "top",
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=12, face="bold"))

# 2.3.2 Sex GAMMs --------

# Female  --------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_female <- gam(ALRI5_5_plus_female_1 ~
                         offset(log(under5population_female)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         materials_index + 
                         elimina_servidas_red_pozo_letrina + 
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         as.factor(period) + 
                         as.factor(canton_id_universal)
                       ,
                       data=full_df_1_female_2,
                       family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_female, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_main_female <- as.data.frame(bind_cols(fit, lci, uci)) 




# Male ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_male <- gam(ALRI5_5_plus_male_1 ~
                            offset(log(under5population_male)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            materials_index + 
                            elimina_servidas_red_pozo_letrina + 
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            as.factor(period) + 
                            as.factor(canton_id_universal)
                          ,
                          data=full_df_1_male_2,
                          family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_male, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_main_male <- as.data.frame(bind_cols(fit, lci, uci)) 


# 2.3.3 Combine and plot --------

sex_cb_estimates_df <- table_name_main_male %>% 
              mutate(model = "Sex: Male") %>% 
  bind_rows(table_name_main_female %>% 
              mutate(model = "Sex: Female")) %>% 
  rename(cf = CF,
         fit = rr_lag0,
         fit.low = lci_lag0,
         fit.high = uci_lag0)

sex_splines_plot <- ggplot() + 
  # geom_ribbon(data=period_cb_estimates_df %>% 
  #               filter(model=="Full model"),
  #             aes(x=cf, y=fit,
  #                 ymin=fit.low, ymax=fit.high), 
  #             alpha=0.1, fill="dodgerblue3", color=NA) +
  # geom_line(data=period_cb_estimates_df %>% 
  #             filter(model=="Full model"),
  #           aes(x=cf, y=fit), size=0.8, color="black") +
  geom_ribbon(data=sex_cb_estimates_df,
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=sex_cb_estimates_df,
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  coord_cartesian(ylim=c(0,2.1)) +
  ggtitle("Regional subsets", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        plot.title = element_blank(), 
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=11, family = "Helvetica", face="bold"))  

histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(alpha=0.8, color="black", bins = 50, fill="dodgerblue4") + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=11, family="Helvetica"),
        legend.title = element_blank(),
        legend.position = "none",
        axis.title = element_text(size=12, family = "Helvetica", face="bold"))  

splines_sex_histogram_plot <- cowplot::plot_grid(
  sex_splines_plot,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.6)
)



# 2.2.3.1 Full region plot -------

sex_figure_full <- 
  cowplot::plot_grid(
    sex_glm_fig,
    splines_sex_histogram_plot,
    nrow=2
    
  )

cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/sex_figure_full_Dec2021.png",
                 sex_figure_full,
                 width = 275,
                 height = 250,
                 units = "mm",
                 dpi = 200
)




# 2.2.4 Extra comparisons for 45% to 55% and 75% to 85% ---------

# 45% to 55% -------

# Female ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_female, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_female.45 <- as.data.frame(bind_cols(fit, lci, uci)) 


# Male ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_male, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_male.45 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 55%) -------
view(table_name_female.45)
view(table_name_male.45)


# 75% to 85% -------


# Female ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_female, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_female_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_female.75 <- as.data.frame(bind_cols(fit, lci, uci)) 


# Male ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_male, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_1_male_2 %>% ungroup() %>% dplyr::select(cf) %>% unlist(),         
  bylag = 0.10, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_male.75 <- as.data.frame(bind_cols(fit, lci, uci)) 

# Review estimates (find estimate closest to 55%) -------
view(table_name_female.75)
view(table_name_male.75)

# 3. Alternative model specifications, including random interepts, non-linear confounding, quasi-poisson vs. negative binomial, and removal of the galapagos -------

# 3.1 Random intercepts vs. fixed effects ------

# GLM 
full_df_1$offset <- log(full_df_1$under5population)

alri5_glm_re_1_mod <- glmer(ALRI5_5_plus ~
                              offset(log(under5population)) + 
                              
                              cf10 + 
                              
                              percent_rural_corrected_ratio + 
                              electricidad_no + 
                              
                              # material_techo_nicer_all + 
                              # material_techo_hormigon_losa_cemento_use + 
                              # material_pared_hormigon_bloque_ladrillo + 
                              # material_piso_nicer + 
                              # material_piso_tierra + 
                              materials_index + 
                              
                              # obtiene_agua_redpublica_tuberia_adentro_use + 
                              # servicio_higenico_redpublica_use + 
                              # elimina_servidas_red_pozo + 
                              elimina_servidas_red_pozo_letrina + 
                              # servicio_ducha_excluso + 
                              # elimina_basura_service + 
                              # hygiene_index + 
                              
                              Literate_women + 
                              InSchool_girls + 
                              Idioma_indigena_self + 
                              
                              # BCG_use + 
                              # DPT3_use + 
                              # SAR_use + 
                              # Polio_use + 
                              pcv3_use + 
                              vaccine_index + 
                              
                              ANC_use +  
                              ANC_visits_median_use +
                              
                              # as.factor(canton_id_universal) +
                              as.factor(period) + 
                              (1 | canton_id_universal),
                            data=full_df_1,
                            # random = list(canton_id_universal = ~1), # random effect
                            family = poisson())

alri5_glm_fe_re_estimates <- tidy(alri5_glm_1_mod)[1,] %>% 
  mutate(model = "Fixed effects for canton") %>% 
  bind_rows(tidy(alri5_glm_re_1_mod)[2,3:7] %>% 
              mutate(model = "Random intercepts for canton")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) 

fe_re_glm <- ggplot() +
  geom_interval(data=alri5_glm_fe_re_estimates, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50) +
  geom_interval(data=alri5_glm_fe_re_estimates, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66) +
  geom_interval(data=alri5_glm_fe_re_estimates, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95) +  
  geom_point(data=alri5_glm_fe_re_estimates, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  
  ggsci::scale_color_lancet() + 
  
  scale_x_discrete(limits = c("Fixed effects for canton", "Random intercepts for canton")) + 
  
  ggtitle("Random intercepts vs. fixed effects for canton",
          subtitle="Fully adjusted generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=12, face="bold"),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size=10, color="black"))

fe_re_glm

# GAM
ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)



## model using crossbasis

alri5_gam_1_mod_cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          ,
                          data=full_df_1,
                          family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean <- as.data.frame(bind_cols(fit, lci, uci)) 

alri5_gamm_1_mod_random_cb <- mgcv::gamm(ALRI5_5_plus ~
                                        offset(offset) + 
                                        
                                        ad.cb + +
                                        
                                        percent_rural_corrected_ratio + 
                                        electricidad_no + 
                                        
                                        # material_techo_nicer_all + 
                                        # material_techo_hormigon_losa_cemento_use + 
                                        # material_pared_hormigon_bloque_ladrillo + 
                                        # material_piso_nicer + 
                                        # material_piso_tierra + 
                                        materials_index + 
                                        
                                        # obtiene_agua_redpublica_tuberia_adentro_use + 
                                        # servicio_higenico_redpublica_use + 
                                        # elimina_servidas_red_pozo + 
                                        elimina_servidas_red_pozo_letrina + 
                                        # servicio_ducha_excluso + 
                                        # elimina_basura_service + 
                                        # hygiene_index + 
                                        
                                        Literate_women + 
                                        InSchool_girls + 
                                        Idioma_indigena_self + 
                                        
                                        # BCG_use + 
                                        # DPT3_use + 
                                        # SAR_use + 
                                        # Polio_use + 
                                        pcv3_use + 
                                        vaccine_index + 
                                        
                                        ANC_use +  
                                        ANC_visits_median_use +
                                        
                                        # as.factor(canton_id_universal) +
                                        as.factor(period)
                                      ,
                                      data=full_df_1,
                                      random = list(canton_id_universal = ~1), # random effect
                                      family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gamm_1_mod_random_cb$gam, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_gamm <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_random_fixed <- ggplot(table_name_mean %>% 
                                mutate(model = "Fixed effects for canton") %>% 
                                bind_rows(table_name_mean_gamm %>% 
                                            mutate(model = "Random intercepts for canton")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, group=model, color=model, fill=model)) + 
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  # geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
  #            label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold")) 


crossbasis_fig_random_fixed

histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(fill="dodgerblue4", alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=11, family="Helvetica"),
        axis.title = element_text(size=11, family = "Helvetica", face="bold"))  

fe_re_spline_histogram_plot <- cowplot::plot_grid(
  crossbasis_fig_random_fixed,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.5)
)


fe_re_spline_histogram_plot

# full fe re plot -------

fe_re_full_plot <- cowplot::plot_grid(
  fe_re_glm,
  fe_re_spline_histogram_plot,
  rel_heights = c(0.8, 1),
  nrow=2
  
)

cowplot::ggsave2("~/Dropbox/Ecuador National Health and LPG Analysis/Final_Repository/Figures/Results/fe_re_full_plot_Dec2021.png",
                 fe_re_full_plot,
                 width = 250,
                 height = 250,
                 units = "mm",
                 dpi = 200
)

# 3.2 Non-linear confounding -------

alri5_gam_1_nlc_mod_cb <- gam(ALRI5_5_plus ~
                             offset(log(under5population)) + 
                             
                             ad.cb +
                             
                             s(percent_rural_corrected_ratio, k=3, fx=T) + 
                             s(electricidad_no, k=3, fx=T) + 
                             
                             # material_techo_nicer_all + 
                             # material_techo_hormigon_losa_cemento_use + 
                             # material_pared_hormigon_bloque_ladrillo + 
                             # material_piso_nicer + 
                             # material_piso_tierra + 
                             materials_index + 
                             
                             # obtiene_agua_redpublica_tuberia_adentro_use + 
                             # servicio_higenico_redpublica_use + 
                             # elimina_servidas_red_pozo + 
                             elimina_servidas_red_pozo_letrina + 
                             # servicio_ducha_excluso + 
                             # elimina_basura_service + 
                             # hygiene_index + 
                             
                             Literate_women + 
                             InSchool_girls + 
                             Idioma_indigena_self + 
                             
                             # BCG_use + 
                             # DPT3_use + 
                             # SAR_use + 
                             # Polio_use + 
                             pcv3_use + 
                             vaccine_index + 
                             
                             ANC_use +  
                             ANC_visits_median_use +
                             
                             as.factor(canton_id_universal) +
                             as.factor(period)
                           ,
                           data=full_df_1,
                           family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_nlc_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_nlc <- as.data.frame(bind_cols(fit, lci, uci)) 


nlc_estimates_df <- estimate1 %>% 
  mutate(model = "Linear confounding only") %>% 
  bind_rows(nlc_plot_estimates %>% 
              mutate(model = "Nonlinear confounding allowed")) 

crossbasis_fig_nlc <- ggplot(table_name_mean %>% 
                                        mutate(model = "Linear confounding only") %>% 
                                        bind_rows(table_name_mean_nlc %>% 
                                                    mutate(model = "Nonlinear confounding allowed")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, group=model, color=model, fill=model)) + 
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  # geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  # geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
  #            label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold")) 


crossbasis_fig_nlc


histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(fill="dodgerblue4", alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=11, family="Helvetica"),
        axis.title = element_text(size=11, family = "Helvetica", face="bold"))  

nlc_spline_histogram_plot <- cowplot::plot_grid(
  crossbasis_fig_nlc,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.5)
)

# plot(alri5_gam_1_mod)

nlc_spline_histogram_plot


# 3.3 Quasi-poisson vs. negative binomial ---------

alri5_glm_nb_1_mod <- fixest::fenegbin(ALRI5_5_plus ~
                               offset(log(under5population)) + 
                               
                               cf10 + 
                               
                               percent_rural_corrected_ratio + 
                               electricidad_no + 
                               
                               # material_techo_nicer_all + 
                               # material_techo_hormigon_losa_cemento_use + 
                               # material_pared_hormigon_bloque_ladrillo + 
                               # material_piso_nicer + 
                               # material_piso_tierra + 
                               materials_index + 
                               
                               # obtiene_agua_redpublica_tuberia_adentro_use + 
                               # servicio_higenico_redpublica_use + 
                               # elimina_servidas_red_pozo + 
                               elimina_servidas_red_pozo_letrina + 
                               # servicio_ducha_excluso + 
                               # elimina_basura_service + 
                               # hygiene_index + 
                               
                               Literate_women + 
                               InSchool_girls + 
                               Idioma_indigena_self + 
                               
                               # BCG_use + 
                               # DPT3_use + 
                               # SAR_use + 
                               # Polio_use + 
                               pcv3_use + 
                               vaccine_index + 
                               
                               ANC_use +  
                               ANC_visits_median_use |
                               canton_id_universal + 
                                 period
                             ,
                             data=full_df_1)

alri5_glm_nb_1_mod


alri5_glm_qp_nb_estimates <- tidy(alri5_glm_1_mod)[1,] %>% 
  mutate(model = "quasi-Poisson") %>% 
  bind_rows(tidy(alri5_glm_nb_1_mod)[1,] %>% 
              mutate(model = "Negative binomial")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))) 

qp_nb_glm_fig <- ggplot() +
  geom_interval(data=alri5_glm_qp_nb_estimates, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50) +
  geom_interval(data=alri5_glm_qp_nb_estimates, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66) +
  geom_interval(data=alri5_glm_qp_nb_estimates, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95,) +  
  geom_point(data=alri5_glm_qp_nb_estimates, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  
  scale_x_discrete(limits = c("quasi-Poisson", "Negative binomial")) + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Outcome modeling",
          subtitle="Fully adjusted generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=12, face="bold"),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"))

alri5_gam_nb_1_mod <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 
                            
                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 
                            
                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 
                            
                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 
                            
                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 
                            
                            ANC_use +  
                            ANC_visits_median_use +
                            
                            as.factor(canton_id_universal) +
                            as.factor(period)
                          ,
                          data=full_df_1,
                          family = nb())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_nb_1_mod, 
  # model.link = "nb", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 

# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_nb <- as.data.frame(bind_cols(fit, lci, uci)) 


nb_estimates_df <- estimate1 %>% 
  mutate(model = "quasi-Poisson") %>% 
  bind_rows(gam_nb_plot_estimates %>% 
              mutate(model = "Negative binomial")) 

crossbasis_fig_qp_nb <- ggplot(table_name_mean %>% 
                               mutate(model = "quasi-Poisson") %>% 
                               bind_rows(table_name_mean_nb %>% 
                                           mutate(model = "Negative binomial")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, group=model, color=model, fill=model)) + 
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  # geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  # geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
  #            label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold")) 


histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(fill="dodgerblue4", alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=11, family="Helvetica"),
        axis.title = element_text(size=11, family = "Helvetica", face="bold"))  

qp_nb_spline_histogram_plot <- cowplot::plot_grid(
  crossbasis_fig_qp_nb,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.5)
)


qp_nb_spline_histogram_plot


qp_nb_plot_full <- cowplot::plot_grid(
  qp_nb_glm_fig,
  qp_nb_spline_histogram_plot,
  rel_heights = c(0.8, 1),
  nrow=2
  
)


ggsave("~/Dropbox/Ecuador National Health and LPG Analysis/Planning/Figures/qp_nb_plot_full_Dec2021.png",
       qp_nb_plot_full,
       width = 200,
       height = 225,
       units = "mm",
       dpi = 250)


# 3.4 removal of the galapagos -----

galapagos_glm_fig <- ggplot() +
  geom_interval(data=alri5_glm_region_estimates, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi), alpha=0.50, color="dodgerblue2") +
  geom_interval(data=alri5_glm_region_estimates, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi), alpha=0.66, color="dodgerblue3") +
  geom_interval(data=alri5_glm_region_estimates, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi), alpha=0.95, color="dodgerblue4") +  
  geom_point(data=alri5_glm_region_estimates, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  
  scale_x_discrete(limits = c("Full model", "Full model,\ndrop Galapagos",
                              # "Amazonian region", "Andean region", 
                              "Coastal region", "Coastal region,\ndrop Galapagos")) + 
  
  ggtitle("Robustness to the exclusion of the Galapagos islands",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7)) +
  # coord_cartesian(ylim=c(0.6, 1.4)) + 
  # coord_cartesian(ylim=c(-1, 0)) + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=10, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=12, face="bold"))



# Main no galapagos  --------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_main_nog %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_nog <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         ad.cb +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         materials_index + 
                         elimina_servidas_red_pozo_letrina + 
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         as.factor(period) + 
                         as.factor(canton_id_universal)
                       ,
                       data=full_df_main_nog,
                       family = quasipoisson())


pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_nog, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at=full_df_main_nog %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_main_nog <- as.data.frame(bind_cols(fit, lci, uci)) 


# Coast no galapagos ------

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_coast_nog %>% dplyr::select(cf) %>% unlist(),         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_1_coastnog <- gam(ALRI5_5_plus ~
                              offset(log(under5population)) + 
                              
                              ad.cb +
                              
                              percent_rural_corrected_ratio + 
                              electricidad_no + 
                              
                              materials_index + 
                              elimina_servidas_red_pozo_letrina + 
                              Literate_women + 
                              InSchool_girls + 
                              Idioma_indigena_self + 
                              
                              pcv3_use + 
                              vaccine_index + 
                              
                              ANC_use +  
                              ANC_visits_median_use +
                              as.factor(period) + 
                              as.factor(canton_id_universal)
                            ,
                            data=full_df_coast_nog,
                            family = quasipoisson())

pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_coast_nog, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at= full_df_coast_nog %>% dplyr::select(cf) %>% unlist(),         
  bylag = .2474156, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7134 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_coast_nog <- as.data.frame(bind_cols(fit, lci, uci)) 



crossbasis_fig_nogalapagos <- ggplot() + 
  geom_ribbon(data=table_name_main_nog %>% 
                mutate(model = "Full, no Galapagos") %>% 
                bind_rows(table_name_coast_nog %>% 
                            mutate(model = "Coast, no Galapagos")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, 
                                                                        group=model, color=model, fill=model), alpha=0.1, color=NA) + 
  geom_line(data=table_name_main_nog %>% 
              mutate(model = "Full, no Galapagos") %>% 
              bind_rows(table_name_coast_nog %>% 
                          mutate(model = "Coast, no Galapagos")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, 
                                                                      group=model, color=model, fill=model), size=1.2) + 
  
  geom_ribbon(data=table_name_mean %>% 
                mutate(model = "Full") %>% 
                bind_rows(table_name_coast %>% 
                            mutate(model = "Coast")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, 
                                                                        group=model, color=model, fill=model), alpha=0.1, color=NA) + 
  geom_line(data=table_name_mean %>% 
              mutate(model = "Full") %>% 
              bind_rows(table_name_coast %>% 
                          mutate(model = "Coast")), aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, 
                                                                      group=model, color=model, fill=model), size=0.6) + 
  
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to the Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=10, family = "Helvetica", face="bold")) 


histogram <- ggplot(full_df_1, aes(x=cf)) +
  geom_histogram(fill="dodgerblue4", alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(axis.text = element_text(size=11, family="Helvetica"),
        axis.title = element_text(size=11, family = "Helvetica", face="bold"))  

no_galapagos_spline_histogram_plot <- cowplot::plot_grid(
  crossbasis_fig_nogalapagos,
  histogram,
  align="hv",
  nrow=2,
  rel_heights = c(1, 0.5)
)


no_galapagos_spline_histogram_plot_full <- cowplot::plot_grid(
  galapagos_glm_fig,
  no_galapagos_spline_histogram_plot,
  rel_heights = c(0.8, 1),
  nrow=2
  
)


ggsave("~/Dropbox/Ecuador National Health and LPG Analysis/Planning/Figures/no_galapagos_spline_histogram_plot_full_Dec2021.png",
       no_galapagos_spline_histogram_plot_full,
       width = 200,
       height = 225,
       units = "mm",
       dpi = 250)





# Province level aggregation --------

# Model 1 -------

prov_df <- 
  full_df_1  %>% 
  group_by(province, period) %>% 
  summarize(
    cf=median(cf, na.rm=T),
    cf10=median(cf10, na.rm=T),
    ALRI5_5_plus=sum(ALRI5_5_plus, na.rm=T),
    under5population=sum(under5population,na.rm=T),
    
    percent_rural_corrected_ratio = mean(percent_rural_corrected_ratio, na.rm=T),
      electricidad_no = mean(electricidad_no, na.rm=T),
      materials_index = mean(materials_index, na.rm=T),
      elimina_servidas_red_pozo_letrina = mean(elimina_servidas_red_pozo_letrina, na.rm=T),
      
      Literate_women = mean(Literate_women, na.rm=T),
      InSchool_girls = mean(InSchool_girls, na.rm=T),
      Idioma_indigena_self = mean(Idioma_indigena_self, na.rm=T),
      
      # BCG_use + 
      # DPT3_use + 
      # SAR_use + 
      # Polio_use + 
      pcv3_use = mean(pcv3_use, na.rm=T),
      vaccine_index = mean(vaccine_index, na.rm=T),
      
      ANC_use = mean(ANC_use, na.rm=T),
      ANC_visits_median_use = median(ANC_visits_median_use, na.rm=T))


prov_df

prov_mod0 <- fixest::feglm(ALRI5_5_plus ~
                offset(log(under5population)) + 
                
                cf10 |
                
                as.factor(province) +
                as.factor(period)
              ,
              data=prov_df,
              family = quasipoisson())


prov_mod1 <- fixest::feglm(ALRI5_5_plus ~
      offset(log(under5population)) + 
      
      cf10 +
      
      percent_rural_corrected_ratio + 
      electricidad_no + 
      
      # material_techo_nicer_all + 
      # material_techo_hormigon_losa_cemento_use + 
      # material_pared_hormigon_bloque_ladrillo + 
      # material_piso_nicer + 
      # material_piso_tierra + 
      materials_index + 
      
      # obtiene_agua_redpublica_tuberia_adentro_use + 
      # servicio_higenico_redpublica_use + 
      # elimina_servidas_red_pozo + 
      elimina_servidas_red_pozo_letrina + 
      # servicio_ducha_excluso + 
      # elimina_basura_service + 
      # hygiene_index + 
      
      Literate_women + 
      InSchool_girls + 
      Idioma_indigena_self + 
      
      # BCG_use + 
      # DPT3_use + 
      # SAR_use + 
      # Polio_use + 
      pcv3_use + 
      vaccine_index + 
      
      ANC_use + 
      ANC_visits_median_use |
      
      as.factor(province) +
      as.factor(period)
    ,
    data=prov_df,
    family = quasipoisson())


prov_mod_df <- 
  tidy(prov_mod0)  %>% 
  mutate(model="Empty Model")  %>% 
  bind_rows(tidy(prov_mod1)[1,] %>% 
              mutate(model="Preferred Specification")) %>% 
  mutate(
  irr = exp(estimate),
  int66_hi = exp(estimate + (0.412*std.error)),
  int66_low = exp(estimate - (0.412*std.error)),
  int80_hi = exp(estimate + (0.842*std.error)),
  int80_low = exp(estimate - (0.842*std.error)),
  int95_hi = exp(estimate + (1.95996*std.error)),
  int95_low = exp(estimate - (1.95996*std.error))) 

prov_alternative_specifications_glm <- ggplot() +
  geom_interval(data=prov_mod_df, 
                aes(x=model, y=irr, ymin=int95_low, ymax=int95_hi, color=model), alpha=0.50, size = 2.5) +
  geom_interval(data=prov_mod_df, 
                aes(x=model, y=irr, ymin=int80_low, ymax=int80_hi, color=model), alpha=0.66, size = 3) +
  geom_interval(data=prov_mod_df, 
                aes(x=model, y=irr, ymin=int66_low, ymax=int66_hi, color=model), alpha=0.95, size = 3.5) +  
  geom_point(data=prov_mod_df, 
             aes(x=model, y=irr), size=2, shape=18, color="white") + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Province-level analysis",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio (66%, 80%, 95% CI)\nper 10 p.p. increase in %CF") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.6, 0.7, 0.8, 0.9, 1.0)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  # scale_color_viridis_d() + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=12, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=13, face="bold"),
        plot.title = element_text(size=14, face="bold"),
        legend.position="top",
        legend.title = element_blank(),
        legend.text=element_text(size=10, color="black"))

prov_alternative_specifications_glm

prov_alri5_gam_1_mod <- gam(ALRI5_5_plus ~
                         offset(log(under5population)) + 
                         
                         s(cf, k=4, fx=T) +
                         
                         percent_rural_corrected_ratio + 
                         electricidad_no + 

                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 

                         # obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 

                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 

                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 

                         ANC_use + 
                         ANC_visits_median_use +
                         
                         as.factor(province) +
                         as.factor(period)
                       ,
                       data=prov_df,
                       family = quasipoisson())


plot(prov_alri5_gam_1_mod)

ad.cb <- crossbasis(
  # the column with the exposure 
  x=prov_df$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)



## model using crossbasis

alri5_gam_1_mod_cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            percent_rural_corrected_ratio + 
                            electricidad_no + 

                            # material_techo_nicer_all + 
                            # material_techo_hormigon_losa_cemento_use + 
                            # material_pared_hormigon_bloque_ladrillo + 
                            # material_piso_nicer + 
                            # material_piso_tierra + 
                            materials_index + 

                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                            # servicio_higenico_redpublica_use + 
                            # elimina_servidas_red_pozo + 
                            elimina_servidas_red_pozo_letrina + 
                            # servicio_ducha_excluso + 
                            # elimina_basura_service + 
                            # hygiene_index + 

                            Literate_women + 
                            InSchool_girls + 
                            Idioma_indigena_self + 

                            # BCG_use + 
                            # DPT3_use + 
                            # SAR_use + 
                            # Polio_use + 
                            pcv3_use + 
                            vaccine_index + 

                            ANC_use + 
                            ANC_visits_median_use +
                            
                            as.factor(province) +
                            as.factor(period)
                          ,
                          data=prov_df,
                          family = quasipoisson())

prov_alri5_gam_0_mod_cb <- gam(ALRI5_5_plus ~
                            offset(log(under5population)) + 
                            
                            ad.cb +
                            
                            as.factor(province) +
                            as.factor(period)
                          ,
                          data=prov_df,
                          family = quasipoisson())



# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = prov_df$cf, # exposure
  bylag = .22976, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7204 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean <- as.data.frame(bind_cols(fit, lci, uci)) 


# Relative to the mean ------
prov_pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = prov_alri5_gam_0_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = prov_df$cf, # exposure
  bylag = .22976, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.7204 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(prov_pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(prov_pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(prov_pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
prov_table_name_mean <- as.data.frame(bind_cols(fit, lci, uci)) 

prov_table_name_mean_comb <- 
  prov_table_name_mean %>% 
  mutate(model = "Empty Model") %>% 
  bind_rows(
    table_name_mean %>% 
      mutate(model = "Preferred Specification")
  )

prov_crossbasis_fig_main <- 
  ggplot(prov_table_name_mean_comb, 
         aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0, color=model, fill=model)) + 
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  scale_fill_aaas() +
  scale_color_aaas() + 
  # geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  # geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
  #            label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.5, 1, 1.5, 2, 2.5, 3)) +
  coord_cartesian(ylim=c(0.5, 3.0)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative\nto the Mean\n(71% CF)") +
  theme(legend.position = "none")


histogram <- 
  ggplot(prov_df, aes(x=cf)) +
  geom_histogram(fill="dodgerblue4", alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of provinces") 

crossbasis_fig_main_hist_prov <- cowplot::plot_grid(
  prov_crossbasis_fig_main,
  histogram,
  align="hv",
  rel_heights = c(1, 0.5),
  nrow=2
)

crossbasis_fig_main_hist_prov


plot_grid(
  prov_alternative_specifications_glm,
  crossbasis_fig_main_hist_prov,
  align="hv",
  nrow=2
)


# Relative to the 45% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = prov_df$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.45 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_45 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_30 <- ggplot(table_name_30, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 30% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 

# Relative to 75% ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_cb, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = full_df_1$cf, # exposure
  bylag = 0.1, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.75 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_75 <- as.data.frame(bind_cols(fit, lci, uci)) 

crossbasis_fig_75 <- ggplot(table_name_75, aes(x=CF, y=rr_lag0, ymin=lci_lag0, ymax=uci_lag0)) + 
  geom_ribbon(alpha=0.1, fill="dodgerblue3", color=NA) + 
  geom_line(size=0.8, color="dodgerblue4") + 
  geom_hline(yintercept = 1, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  geom_vline(xintercept=0.61, linetype=2, size=0.8) + 
  # annotate("text", x=0.725, y=1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)", size=4) +
  geom_label(aes(x = 0.725, y = 1.7, label = "Threshold: 61%\n(95% CI, 52%-70%)"), 
             label.size = NA, fill = "white") + 
  # coord_cartesian(ylim=c(-.7, 1)) +
  # geom_hline(yintercept = 0, linetype = 2, size=0.3) + 
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
                              1.2, 1.4,  1.6, 1.8),
                     minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9)) +
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to 75% CF") + 
  theme(axis.text = element_text(size=12, family="Helvetica", color="black"),
        axis.title.x=element_blank(),
        legend.position = "none",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=13, family = "Helvetica", face="bold")) 


# Region fixed effect -------

full_df_1 <- full_df_1 %>% 
  mutate(region = 
           ifelse(province=="19" |
           province=="14" |
           province=="16" |
           province=="15" |
           province == "21", "Amazon",
           ifelse(province=="11" |
                    province=="01" |
                    province == "10" |
                    province=="06" |
                    province=="02" |
                    province == "03" | # canar
                    province=="05" |
                    province=="17" |
                    province=="04", "Andes", 
                  ifelse(province=="07" |
           province=="09" |
           province=="12" |
           province=="13" |
           province == "08" | # esmeraldas
           province == "18" | # tungurahua
           province=="20", "Coast", NA)))) %>% 
  mutate(region_f = factor(region, levels=c("Coast", "Andes", "Amazon")))

alri5_gam_1_mod_regionfe <- 
  gam(ALRI5_5_plus ~
        offset(log(under5population)) + 
        
        s(cf, by=region_f) +
        
        percent_rural_corrected_ratio + 
                         electricidad_no + 
                         
                         # material_techo_nicer_all + 
                         # material_techo_hormigon_losa_cemento_use + 
                         # material_pared_hormigon_bloque_ladrillo + 
                         # material_piso_nicer + 
                         # material_piso_tierra + 
                         materials_index + 
                         
                         # obtiene_agua_redpublica_tuberia_adentro_use + 
                         # servicio_higenico_redpublica_use + 
                         # elimina_servidas_red_pozo + 
                         elimina_servidas_red_pozo_letrina + 
                         # servicio_ducha_excluso + 
                         # elimina_basura_service + 
                         # hygiene_index + 
                         
                         Literate_women + 
                         InSchool_girls + 
                         Idioma_indigena_self + 
                         
                         # BCG_use + 
                         # DPT3_use + 
                         # SAR_use + 
                         # Polio_use + 
                         pcv3_use + 
                         vaccine_index + 
                         
                         ANC_use +  
                         ANC_visits_median_use +
                         
                        # as.factor(region) + 
                         as.factor(canton_id_universal) +
                         as.factor(period)
                       ,
                       data=full_df_1,
                       family = quasipoisson())

plot(alri5_gam_1_mod_regionfe)

main_model_region_interact <- 
  fixest::feglm(
    ALRI5_5_plus ~
      offset(log(under5population)) + 
        
        cf10*region_f +
      
      percent_rural_corrected_ratio + 
      electricidad_no + 
      
      
      materials_index + 
      
      # obtiene_agua_redpublica_tuberia_adentro_use + 
      # servicio_higenico_redpublica_use + 
      # elimina_servidas_red_pozo + 
      elimina_servidas_red_pozo_letrina + 
      # servicio_ducha_excluso + 
      # elimina_basura_service + 
      # hygiene_index + 
      
      Literate_women + 
      InSchool_girls + 
      Idioma_indigena_self + 
      
      # BCG_use + 
      # DPT3_use + 
      # SAR_use + 
      # Polio_use + 
      pcv3_use + 
      vaccine_index + 
      
      ANC_use +  
      ANC_visits_median_use 
      |
      # as.factor(region) + 
        canton_id_universal +
      as.factor(period),
    data=full_df_1,
    family = quasipoisson())


region_fe_linear_coast <- 
  tidy(main_model_region_interact)[1,]

region_fe_linear_amazon <- 
  tidy(main_model_region_interact)[1,]

region_fe_linear_andes <- 
  tidy(main_model_region_interact)[1,]


region_fe_linear_df <- 
  region_fe_linear_coast %>% 
  bind_rows(region_fe_linear_andes, 
            region_fe_linear_amazon) %>% 
  mutate(term = c("Coast", "Andes", "Amazon")) %>% 
  mutate(
    irr = exp(estimate),
    int66_hi = exp(estimate + (0.412*std.error)),
    int66_low = exp(estimate - (0.412*std.error)),
    int80_hi = exp(estimate + (0.842*std.error)),
    int80_low = exp(estimate - (0.842*std.error)),
    int95_hi = exp(estimate + (1.95996*std.error)),
    int95_low = exp(estimate - (1.95996*std.error))
    ) 

region_fe_linear_fig <- 
  ggplot() +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int95_low, ymax=int95_hi, color=term), alpha=0.50, size = 2.5) +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int80_low, ymax=int80_hi, color=term), alpha=0.66, size = 3) +
  geom_interval(data=region_fe_linear_df, 
                aes(x=term, y=irr, ymin=int66_low, ymax=int66_hi, color=term), alpha=0.95, size = 3.5) +  
  geom_point(data=region_fe_linear_df, 
             aes(x=term, y=irr), size=2, shape=18, color="white") + 
  ggsci::scale_color_lancet() + 
  
  ggtitle("Regional interaction plot",
          subtitle = "Quasi-Poisson generalized linear models") + 
  ylab("Mortality rate ratio\n(66%, 80%, 95% CI)\nper 10 p.p. increase in %CF") + xlab("") + 
  
  geom_hline(yintercept=1) + 
  scale_y_continuous(breaks=c(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2)) +
  # coord_cartesian(ylim=c(-1, 0)) + 
  # scale_color_viridis_d() + 
  theme_bw() + 
  theme(axis.text.y = element_text(size=12, color="black"),
        axis.text.x = element_text(size=10, color="black"),
        axis.title = element_text(size=10, face="bold"),
        plot.title = element_text(size=14, face="bold"),
        legend.position="top",
        legend.title = element_blank(),
        legend.text=element_text(size=10, color="black"))

n linear plots -------
p0_region1 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=2)

estimate0_region1 <- unlist(p0_region1[[2]]$fit) %>% 
  bind_cols(p0_region1[[2]]$se) %>% 
  bind_cols(p0_region1[[2]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Coast")

p0_region2 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=3)

estimate0_region2 <- unlist(p0_region2[[3]]$fit) %>% 
  bind_cols(p0_region2[[3]]$se) %>% 
  bind_cols(p0_region2[[3]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Andes")

p0_region3 <- plot(alri5_gam_1_mod_regionfe, shade=TRUE, select=1)

estimate0_region3 <- unlist(p0_region3[[1]]$fit) %>% 
  bind_cols(p0_region3[[1]]$se) %>% 
  bind_cols(p0_region3[[1]]$x) %>% 
  rename(cf = `...3`,
         fit = `...1`,
         se = `...2`) %>% 
  mutate(fit.high = fit + se,
         fit.low = fit - se) %>% 
  mutate(region="Amazon")


estimate0_regions <- 
  estimate0_region3 %>% 
  bind_rows(estimate0_region1, estimate0_region2)

region_fe_nonlinear_fig <- 
  ggplot(estimate0_regions, 
         aes(x=cf, y=fit, ymin=fit.low, ymax=fit.high, color=region, fill=region, group=region)) +
  geom_ribbon(alpha=0.1, color=NA) + 
  geom_line(size=0.8) + 
  geom_hline(yintercept = 0, size=0.5, color="grey20", linetype=3) + 
  theme_bw() + 
  ggsci::scale_color_lancet() + 
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) +
  # scale_y_continuous(breaks=c(0.4, 0.6,  0.8, 1,
  #                             1.2, 1.4,  1.6, 1.8, 2.0, 2.2),
  #                    minor_breaks = c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1)) +
  xlab("Clean fuel use") + 
  ylab("Response of under-5 LRI\nmortality relative to the Mean\n(71% CF)") + 
  theme(legend.position = "none",
        axis.title.y = element_text(size=10))  



region_fe_linear_fig

region_histogram <- ggplot(full_df_1, aes(x=cf, group=region, fill=region)) +
  geom_histogram(alpha=0.8, color="black", bins = 50) + 
  theme_bw() +
  ggsci::scale_fill_lancet() + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + ylab("Count of cantons") +
  theme(legend.position = "none",
        axis.title.y = element_text(size=10))  

region_fe_figure <- cowplot::plot_grid(
  region_fe_linear_fig,
  region_fe_nonlinear_fig,
  region_histogram,
  align="hv",
  rel_heights = c(1, 1, 0.5),
  nrow=3
)

region_fe_figure

# Remove Quito and Guayaquil -------

# Linear models ------
alri5_glm_empty_mod_noqg <- fixest::feglm(ALRI5_5_plus ~
                                  offset(log(under5population)) + 
                                  
                                  cf10 |
                                  
                                  as.factor(canton_id_universal) +
                                  as.factor(period)
                                ,
                                data=full_df_1 %>% 
                                  filter(canton_id_universal!="09_01") %>% 
                                  filter(canton_id_universal!="17_01"),
                                family = quasipoisson())

tidy(alri5_glm_empty_mod_noqg)

alri5_glm_1_mod_noqg <- fixest::feglm(ALRI5_5_plus ~
                                            offset(log(under5population)) + 
                                            
                                            cf10 +
                                        
                                            percent_rural_corrected_ratio + 
                                            electricidad_no + 
                                            
                                            # material_techo_nicer_all + 
                                            # material_techo_hormigon_losa_cemento_use + 
                                            # material_pared_hormigon_bloque_ladrillo + 
                                            # material_piso_nicer + 
                                            # material_piso_tierra + 
                                            materials_index + 
                                            
                                            # obtiene_agua_redpublica_tuberia_adentro_use + 
                                            # servicio_higenico_redpublica_use + 
                                            # elimina_servidas_red_pozo + 
                                            elimina_servidas_red_pozo_letrina + 
                                            # servicio_ducha_excluso + 
                                            # elimina_basura_service + 
                                            # hygiene_index + 
                                            
                                            Literate_women + 
                                            InSchool_girls + 
                                            Idioma_indigena_self + 
                                            
                                            # BCG_use + 
                                            # DPT3_use + 
                                            # SAR_use + 
                                            # Polio_use + 
                                            pcv3_use + 
                                            vaccine_index + 
                                            
                                            ANC_use +  
                                            ANC_visits_median_use |
                                            
                                            as.factor(canton_id_universal) +
                                            as.factor(period)
                                          ,
                                          data=full_df_1 %>% 
                                            filter(canton_id_universal!="09_01") %>% 
                                            filter(canton_id_universal!="17_01"),
                                          family = quasipoisson())

tidy(alri5_glm_1_mod_noqg)

# Empty model -------

full_df_1_noqg <- 
  full_df_1 %>% 
  filter(canton_id_universal!="09_01") %>% 
  filter(canton_id_universal!="17_01")

ad.cb <- crossbasis(
  # the column with the exposure 
  x=full_df_1_noqg$cf,         
  # The number of lags
  # the 0th lag is the same day. 
  lag=0,    
  # the functional form of the constraint on the exposure dimension
  argvar=list(fun="bs", df = 3)) # I use the df I found in the test model (above)


alri5_gam_empty_mod_noqg <- gam(ALRI5_5_plus ~
                                  offset(log(under5population)) + 
                                  
                                  ad.cb +
                                  
                                  as.factor(canton_id_universal) +
                                  as.factor(period)
                                ,
                                data=full_df_1 %>% 
                                  filter(canton_id_universal!="09_01") %>% 
                                  filter(canton_id_universal!="17_01"),
                                family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_empty_mod_noqg, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = prov_df$cf, # exposure
  bylag = .2475, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.711 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_empty_noqg <- as.data.frame(bind_cols(fit, lci, uci)) 


# Model 1 -------

alri5_gam_1_mod_noqg <- gam(ALRI5_5_plus ~
                              offset(log(under5population)) + 
                              
                              ad.cb +
                              
                              percent_rural_corrected_ratio + 
                              electricidad_no + 
                              
                              # material_techo_nicer_all + 
                              # material_techo_hormigon_losa_cemento_use + 
                              # material_pared_hormigon_bloque_ladrillo + 
                              # material_piso_nicer + 
                              # material_piso_tierra + 
                              materials_index + 
                              
                              # obtiene_agua_redpublica_tuberia_adentro_use + 
                              # servicio_higenico_redpublica_use + 
                              # elimina_servidas_red_pozo + 
                              elimina_servidas_red_pozo_letrina + 
                              # servicio_ducha_excluso + 
                              # elimina_basura_service + 
                              # hygiene_index + 
                              
                              Literate_women + 
                              InSchool_girls + 
                              Idioma_indigena_self + 
                              
                              # BCG_use + 
                              # DPT3_use + 
                              # SAR_use + 
                              # Polio_use + 
                              pcv3_use + 
                              vaccine_index + 
                              
                              ANC_use +  
                              ANC_visits_median_use +
                              
                              as.factor(canton_id_universal) +
                              as.factor(period)
                            ,
                            data=full_df_1 %>% 
                              filter(canton_id_universal!="09_01") %>% 
                              filter(canton_id_universal!="17_01"),
                            family = quasipoisson())

# Relative to the mean ------
pred.est <- crosspred(
  # the exposure crossbasis
  basis = ad.cb,   
  # the model
  model = alri5_gam_1_mod_noqg, 
  model.link = "log", # in my case is log because my model is a quasipoisson
  at = prov_df$cf, # exposure
  bylag = .248, # estimate association in increments of x. You can chose whatever increment you have. A good increment value would be the SD
  cen = 0.711 # this is where your specify your reference point, which in our case should be the exposure mean
) 


# 3d.i Extract fit
fit <- as.data.frame(pred.est$matRRfit)  
colnames(fit) <- paste0("rr_", colnames(fit))
fit <- fit %>% mutate(CF = as.numeric(row.names(fit)))

# 3d.ii Extract 95% CI  
lci <- as.data.frame(pred.est$matRRlow)  
colnames(lci) <- paste0("lci_", colnames(lci))

uci <- as.data.frame(pred.est$matRRhigh)  
colnames(uci) <- paste0("uci_", colnames(uci))

# 3d.iii Combine fit and se 
table_name_mean_1_noqg <- as.data.frame(bind_cols(fit, lci, uci)) 


noqg_cb_estimates_df <- table_name_mean_empty_noqg %>% 
  mutate(model = "Empty Model\n(No Quito or Guayquil)") %>% 
  bind_rows(table_name_mean_1_noqg %>% 
              mutate(model = "Preferred Specification\n(No Quito or Guayquil)")) %>% 
  rename(cf = CF,
         fit = rr_lag0,
         fit.low = lci_lag0,
         fit.high = uci_lag0)

noqg_cb_estimates_fig <- ggplot() + 
  geom_ribbon(data=noqg_cb_estimates_df,
              aes(x=cf, y=fit,
                  ymin=fit.low, ymax=fit.high, group=model, color=model), 
              alpha=0.1, fill="dodgerblue3", color=NA) +
  geom_line(data=noqg_cb_estimates_df,
            aes(x=cf, y=fit, group=model, color=model), 
            size=0.8) +
  ggsci::scale_color_lancet() + 
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, size=0.3) +
  coord_cartesian(ylim=c(0,2.1)) +
  ggtitle("Models without Quito or Guayaquil", 
          subtitle="Quasi-Poisson generalized additive mixed models") + 
  scale_x_continuous(labels=scales::percent_format()) + 
  xlab("Clean fuel use") + 
  ylab("Mortality rate ratio relative to Overall Mean\n(71% CF)") + 
  theme(axis.text = element_text(size=10, family="Helvetica"),
        axis.title.x=element_blank(),
        legend.position = "bottom",
        legend.title = element_blank(),
        legend.text = element_text(size=10, family="Helvetica"),
        axis.title.y = element_text(size=11, family = "Helvetica", face="bold"))  


noqg_cb_estimates_fig
